/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * The MIT License (MIT)
 * Copyright (c) 2014 Martin Kleppmann
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package org.apache.hadoop.hbase.test.util.warc;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Immutable implementation of a record in a WARC file. You create a {@link WARCRecord}
 * by parsing it out of a {@link DataInput} stream.
 *
 * The file format is documented in the
 * [ISO Standard](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf).
 * In a nutshell, it's a textual format consisting of lines delimited by `\r\n`.
 * Each record has the following structure:
 *
 * 1. A line indicating the WARC version number, such as `WARC/1.0`.
 * 2. Several header lines (in key-value format, similar to HTTP or email headers),
 *    giving information about the record. The header is terminated by an empty line.
 * 3. A body consisting of raw bytes (the number of bytes is indicated in one of the headers).
 * 4. A final separator of `\r\n\r\n` before the next record starts.
 *
 * There are various different types of records, as documented on
 * {@link Header#getRecordType()}.
 */
public class WARCRecord {

  public static final String WARC_VERSION = "WARC/1.0";
  private static final Pattern VERSION_PATTERN = Pattern.compile("WARC/[0-9\\.]+");
  private static final Pattern CONTINUATION_PATTERN = Pattern.compile("^[\\t ]+.*");
  private static final String CRLF = "\r\n";
  private static final byte[] CRLF_BYTES = { 13, 10 };

  private final Header header;
  private final byte[] content;

  /**
   * Creates a new WARCRecord by parsing it out of a {@link DataInput} stream.
   * @param in The input source from which one record will be read.
   */
  public WARCRecord(DataInput in) throws IOException {
    header = readHeader(in);
    content = new byte[header.getContentLength()];
    in.readFully(content);
    readSeparator(in);
  }

  private static Header readHeader(DataInput in) throws IOException {
    String versionLine = readLine(in);
    if (!VERSION_PATTERN.matcher(versionLine).matches()) {
      throw new IllegalStateException("Expected WARC version, but got: " + versionLine);
    }

    LinkedHashMap<String, String> headers = new LinkedHashMap<String, String>();
    String line, fieldName = null;

    do {
      line = readLine(in);
      if (fieldName != null && CONTINUATION_PATTERN.matcher(line).matches()) {
        headers.put(fieldName, headers.get(fieldName) + line);
      } else if (!line.isEmpty()) {
        String[] field = line.split(":", 2);
        if (field.length < 2) {
          throw new IllegalStateException("Malformed header line: " + line);
        }
        fieldName = field[0].trim();
        headers.put(fieldName, field[1].trim());
      }
    } while (!line.isEmpty());

    return new Header(headers);
  }

  private static String readLine(DataInput in) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    boolean seenCR = false, seenCRLF = false;
    while (!seenCRLF) {
      byte b = in.readByte();
      if (!seenCR && b == 13) {
        seenCR = true;
      } else if (seenCR && b == 10) {
        seenCRLF = true;
      } else {
        seenCR = false;
        out.write(b);
      }
    }
    return out.toString("UTF-8");
  }

  private static void readSeparator(DataInput in) throws IOException {
    byte[] sep = new byte[4];
    in.readFully(sep);
    if (sep[0] != 13 || sep[1] != 10 || sep[2] != 13 || sep[3] != 10) {
      throw new IllegalStateException(String.format(
        "Expected final separator CR LF CR LF, but got: %d %d %d %d",
        sep[0], sep[1], sep[2], sep[3]));
    }
  }

  /**
   * Returns the parsed header structure of the WARC record.
   */
  public Header getHeader() {
    return header;
  }

  /**
   * Returns the body of the record, as an unparsed raw array of bytes. The content
   * of the body depends on the type of record (see {@link Header#getRecordType()}).
   * For example, in the case of a `response` type header, the body consists of the
   * full HTTP response returned by the server (HTTP headers followed by the body).
   */
  public byte[] getContent() {
    return content;
  }

  /**
   * Writes this record to a {@link DataOutput} stream. The output may, in some edge
   * cases, be not byte-for-byte identical to what was parsed from a {@link DataInput}.
   * However it has the same meaning and should not lose any information.
   * @param out The output stream to which this record should be appended.
   */
  public void write(DataOutput out) throws IOException {
    header.write(out);
    out.write(CRLF_BYTES);
    out.write(content);
    out.write(CRLF_BYTES);
    out.write(CRLF_BYTES);
  }

  /**
   * Returns a human-readable string representation of the record.
   */
  @Override
  public String toString() {
    return header.toString();
  }

  /**
   * Contains the parsed headers of a {@link WARCRecord}. Each record contains a number
   * of headers in key-value format, where some header keys are standardised, but
   * nonstandard ones can be added.
   *
   * The documentation of the methods in this class is excerpted from the
   * [WARC 1.0 specification](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf).
   * Please see the specification for more detail.
   */
  public final static class Header {
    private final Map<String, String> fields;

    private Header(Map<String, String> fields) {
      this.fields = fields;
    }

    /**
     * Returns the type of WARC record (the value of the `WARC-Type` header field).
     * WARC 1.0 defines the following record types: (for full definitions, see the
     * [spec](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf))
     *
     *  *  `warcinfo`: Describes the records that follow it, up through end of file,
     *     end of input, or until next `warcinfo` record. Typically, this appears once and
     *     at the beginning of a WARC file. For a web archive, it often contains information
     *     about the web crawl which generated the following records.
     *
     *     The format of this descriptive record block may vary, though the use of the
     *     `"application/warc-fields"` content-type is recommended. (...)
     *
     *  *  `response`: The record should contain a complete scheme-specific response, including
     *     network protocol information where possible. For a target-URI of the `http` or
     *     `https` schemes, a `response` record block should contain the full HTTP
     *     response received over the network, including headers. That is, it contains the
     *     'Response' message defined by section 6 of HTTP/1.1 (RFC2616).
     *
     *     The WARC record's Content-Type field should contain the value defined by HTTP/1.1,
     *     `"application/http;msgtype=response"`. The payload of the record is defined as its
     *     'entity-body' (per RFC2616), with any transfer-encoding removed.
     *
     *  *  `resource`: The record contains a resource, without full protocol response
     *     information. For example: a file directly retrieved from a locally accessible
     *     repository or the result of a networked retrieval where the protocol information
     *     has been discarded. For a target-URI of the `http` or `https` schemes, a `resource`
     *     record block shall contain the returned 'entity-body' (per RFC2616, with any
     *     transfer-encodings removed), possibly truncated.
     *
     *  *  `request`: The record holds the details of a complete scheme-specific request,
     *     including network protocol information where possible. For a target-URI of the
     *     `http` or `https` schemes, a `request` record block should contain the full HTTP
     *     request sent over the network, including headers. That is, it contains the
     *     'Request' message defined by section 5 of HTTP/1.1 (RFC2616).
     *
     *     The WARC record's Content-Type field should contain the value defined by HTTP/1.1,
     *     `"application/http;msgtype=request"`. The payload of a `request` record with a
     *     target-URI of scheme `http` or `https` is defined as its 'entity-body' (per
     *     RFC2616), with any transfer-encoding removed.
     *
     *  *  `metadata`: The record contains content created in order to further describe,
     *     explain, or accompany a harvested resource, in ways not covered by other record
     *     types. A `metadata` record will almost always refer to another record of another
     *     type, with that other record holding original harvested or transformed content.
     *
     *     The format of the metadata record block may vary. The `"application/warc-fields"`
     *     format may be used.
     *
     *  *  `revisit`: The record describes the revisitation of content already archived,
     *     and might include only an abbreviated content body which has to be interpreted
     *     relative to a previous record. Most typically, a `revisit` record is used
     *     instead of a `response` or `resource` record to indicate that the content
     *     visited was either a complete or substantial duplicate of material previously
     *     archived.
     *
     *     A `revisit` record shall contain a WARC-Profile field which determines the
     *     interpretation of the record's fields and record block. Please see the
     *     specification for details.
     *
     *  *  `conversion`: The record shall contain an alternative version of another
     *     record's content that was created as the result of an archival process.
     *     Typically, this is used to hold content transformations that maintain viability
     *     of content after widely available rendering tools for the originally stored
     *     format disappear. As needed, the original content may be migrated (transformed)
     *     to a more viable format in order to keep the information usable with current
     *     tools while minimizing loss of information.
     *
     *  *  `continuation`: Record blocks from `continuation` records must be appended to
     *     corresponding prior record blocks (eg. from other WARC files) to create the
     *     logically complete full-sized original record. That is, `continuation`
     *     records are used when a record that would otherwise cause a WARC file size to
     *     exceed a desired limit is broken into segments. A continuation record shall
     *     contain the named fields `WARC-Segment-Origin-ID` and `WARC-Segment-Number`,
     *     and the last `continuation` record of a series shall contain a
     *     `WARC-Segment-Total-Length` field. Please see the specification for details.
     *
     *  *  Other record types may be added in future, so this list is not exclusive.
     *
     * @return The record's `WARC-Type` header field, as a string.
     */
    public String getRecordType() {
      return fields.get("WARC-Type");
    }

    /**
     * A 14-digit UTC timestamp formatted according to YYYY-MM-DDThh:mm:ssZ, described
     * in the W3C profile of ISO8601. The timestamp shall represent the instant that
     * data capture for record creation began. Multiple records written as part of a
     * single capture event shall use the same WARC-Date, even though the times of
     * their writing will not be exactly synchronized.
     *
     * @return The record's `WARC-Date` header field, as a string.
     */
    public String getDateString() {
      return fields.get("WARC-Date");
    }

    /**
     * An identifier assigned to the current record that is globally unique for its
     * period of intended use. No identifier scheme is mandated by this specification,
     * but each record-id shall be a legal URI and clearly indicate a documented and
     * registered scheme to which it conforms (e.g., via a URI scheme prefix such as
     * `http:` or `urn:`).
     *
     * @return The record's `WARC-Record-ID` header field, as a string.
     */
    public String getRecordID() {
      return fields.get("WARC-Record-ID");
    }

    /**
     * The MIME type (RFC2045) of the information contained in the record's block. For
     * example, in HTTP request and response records, this would be `application/http`
     * as per section 19.1 of RFC2616 (or `application/http; msgtype=request` and
     * `application/http; msgtype=response` respectively).
     *
     * In particular, the content-type is *not* the value of the HTTP Content-Type
     * header in an HTTP response, but a MIME type to describe the full archived HTTP
     * message (hence `application/http` if the block contains request or response
     * headers).
     *
     * @return The record's `Content-Type` header field, as a string.
     */
    public String getContentType() {
      return fields.get("Content-Type");
    }

    /**
     * The original URI whose capture gave rise to the information content in this record.
     * In the context of web harvesting, this is the URI that was the target of a
     * crawler's retrieval request. For a `revisit` record, it is the URI that was the
     * target of a retrieval request. Indirectly, such as for a `metadata`, or `conversion`
     * record, it is a copy of the `WARC-Target-URI` appearing in the original record to
     * which the newer record pertains. The URI in this value shall be properly escaped
     * according to RFC3986, and written with no internal whitespace.
     *
     * @return The record's `WARC-Target-URI` header field, as a string.
     */
    public String getTargetURI() {
      return fields.get("WARC-Target-URI");
    }

    /**
     * The number of bytes in the body of the record, similar to RFC2616.
     *
     * @return The record's `Content-Length` header field, parsed into an int.
     */
    public int getContentLength() {
      String lengthStr = fields.get("Content-Length");
      if (lengthStr == null) {
        throw new IllegalStateException("Missing Content-Length header");
      }
      try {
        return Integer.parseInt(lengthStr);
      } catch (NumberFormatException e) {
        throw new IllegalStateException("Malformed Content-Length header: " + lengthStr);
      }
    }

    /**
     * Returns the value of a selected header field, or null if there is no header with
     * that field name.
     * @param field The name of the header to return (case-sensitive).
     * @return The value associated with that field name, or null if not present.
     */
    public String getField(String field) {
      return fields.get(field);
    }

    /**
     * Appends this header to a {@link DataOutput} stream, in WARC/1.0 format.
     * @param out The data output to which the header should be written.
     */
    public void write(DataOutput out) throws IOException {
      out.write(toString().getBytes("UTF-8"));
    }

    /**
     * Formats this header in WARC/1.0 format, consisting of a version line followed
     * by colon-delimited key-value pairs, and `\r\n` line endings.
     */
    @Override
    public String toString() {
      StringBuilder buf = new StringBuilder();
      buf.append(WARC_VERSION);
      buf.append(CRLF);
      for (Map.Entry<String, String> field : fields.entrySet()) {
        buf.append(field.getKey());
        buf.append(": ");
        buf.append(field.getValue());
        buf.append(CRLF);
      }
      return buf.toString();
    }
  }

}
